@@ -33,6 +33,8 @@ module Agents |
||
33 | 33 |
|
34 | 34 |
"@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and ".//text()" is to extract all the enclosed text. You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number, etc. Note that these functions take a string, not a node set, so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`. |
35 | 35 |
|
36 |
+ Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions, all namespaces are stripped from the document, so refer to elements without a namespace prefix. |
|
37 |
+ |
|
36 | 38 |
When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. For example: |
37 | 39 |
|
38 | 40 |
"extract": { |
@@ -368,6 +368,69 @@ describe Agents::WebsiteAgent do |
||
368 | 368 |
expect(event.payload['response_info']).to eq('The reponse was 200 OK.') |
369 | 369 |
end |
370 | 370 |
|
371 |
# Covers XML extraction by a WebsiteAgent against a stubbed Atom feed fixture,
# using both `xpath` and `css` selectors for the `extract` option.
describe "XML" do
  before do
    # Any request whose URL matches /github_rss/ is answered with the fixture.
    atom_feed = File.read(Rails.root.join("spec/data_fixtures/github_rss.atom"))
    stub_request(:any, /github_rss/).to_return(status: 200, body: atom_feed)

    # The XPath value expressions below name elements without any namespace prefix.
    agent_options = {
      'name' => 'GitHub',
      'expected_update_period_in_days' => '2',
      'type' => 'xml',
      'url' => 'http://example.com/github_rss.atom',
      'mode' => 'on_change',
      'extract' => {
        'title' => { 'xpath' => '/feed/entry', 'value' => 'normalize-space(./title)' },
        'url' => { 'xpath' => '/feed/entry', 'value' => './link[1]/@href' },
        'thumbnail' => { 'xpath' => '/feed/entry', 'value' => './thumbnail/@url' },
      }
    }

    @checker = Agents::WebsiteAgent.new(name: 'github', options: agent_options, keep_events_for: 2)
    @checker.user = users(:bob)
    @checker.save!
  end

  it "works with XPath" do
    expect { @checker.check }.to change { Event.count }.by(20)

    latest = Event.last
    expect(latest.payload['title']).to eq('Shift to dev group')
    expect(latest.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
    expect(latest.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
  end

  it "works with CSS selectors" do
    # First pass: CSS selectors combined with un-prefixed value expressions.
    # Events are still created, but title and thumbnail are expected to be empty.
    unprefixed_extract = {
      'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./title)' },
      'url' => { 'css' => 'feed > entry', 'value' => './link[1]/@href' },
      'thumbnail' => { 'css' => 'feed > entry', 'value' => './thumbnail/@url' },
    }
    @checker.options['extract'] = unprefixed_extract
    @checker.save!

    expect { @checker.check }.to change { Event.count }.by(20)

    latest = Event.last
    expect(latest.payload['title']).to be_empty
    expect(latest.payload['thumbnail']).to be_empty

    # Second pass: the same selectors, but the value expressions carry the
    # `xmlns:` / `media:` prefixes; the real values are expected this time.
    prefixed_extract = {
      'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./xmlns:title)' },
      'url' => { 'css' => 'feed > entry', 'value' => './xmlns:link[1]/@href' },
      'thumbnail' => { 'css' => 'feed > entry', 'value' => './media:thumbnail/@url' },
    }
    @checker.options['extract'] = prefixed_extract
    @checker.save!

    expect { @checker.check }.to change { Event.count }.by(20)

    latest = Event.last
    expect(latest.payload['title']).to eq('Shift to dev group')
    expect(latest.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
    expect(latest.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
  end
end
|
433 |
+ |
|
371 | 434 |
describe "JSON" do |
372 | 435 |
it "works with paths" do |
373 | 436 |
json = { |